#chooseCRANmirror(ind=70)
#install.packages("readxl")
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(ggplot2)
library(stringr)
library(dplyr)
library(textclean)
library(stopwords)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(forcats)

Read Excel Freeland Bunker Journal Transcription (1873-1874)

# Load the yearly journal transcriptions and their name-annotated companions.
journal_1873 <- read_excel("data/journal_1873.xlsx")
journal_1874 <- read_excel("data/journal_1874.xlsx")
names_journal_1873 <- read_excel("data/names_journal_1873.xlsx")
names_journal_1874 <- read_excel("data/names_journal_1874.xlsx")

# The 1873 sheet stores its month column under the name `date`; rename it so
# the join below can key on `month`.
journal_1873 <- rename(journal_1873, month = date)

# Typos in `journal_entry` and `letter` were corrected while building the
# names_journal_* files, so those files are the left-hand side of each join and
# their versions of the key columns are the ones that are kept.
journal_1873 <- names_journal_1873 %>%
  left_join(
    journal_1873,
    by = c("date_mdy", "month", "journal_entry", "letter")
  )
journal_1874 <- names_journal_1874 %>%
  left_join(
    journal_1874,
    by = c("date_mdy", "month", "journal_entry", "letter")
  )

Exploratory Visualizations

Wind Speed 1873

# One row per wind-speed term recorded in 1873, tagged with the period of the
# day (am / pm / night) and a coarser vocabulary category.
wind_speed_1873 <- journal_1873 %>%
  select(
    date_mdy,
    month,
    wind_speed_am,
    wind_speed_pm,
    wind_speed_night
  ) %>%
  mutate(year = "1873") %>%
  pivot_longer(
    cols = starts_with("wind_speed"),
    names_to = "period",
    values_to = "wind_speed"
  ) %>%
  mutate(period = case_when(
    period == "wind_speed_am" ~ "am",
    period == "wind_speed_pm" ~ "pm",
    period == "wind_speed_night" ~ "night"
  )) %>%
  separate_rows(wind_speed, sep = ",") %>%
  # Splitting on a bare comma leaves the space that follows it attached to the
  # next term ("a, b" -> "a", " b"), which would defeat the exact %in% matches
  # below, so surrounding whitespace is stripped first.
  mutate(wind_speed = str_trim(wind_speed)) %>%
  mutate(category = case_when(
    wind_speed %in% c("blustering", "very blustering") ~ "blustering",
    wind_speed %in% c(
      "breezy", "good breeze", "very fresh breeze", "breezed up",
      "fresh breeze", "heavy breeze", "smart breeze", "strong breeze"
    ) ~ "breeze",
    wind_speed %in% c(
      "blowing very heavy", "fresh blow", "heavy blow", "very heavy blow",
      "blowy", "blowing", "blowing hard", "blowing very hard"
    ) ~ "blow",
    wind_speed %in% c("fresh gale", "gale") ~ "gale",
    wind_speed %in% c("moderate", "quite moderate", "very moderate") ~ "moderate",
    wind_speed %in% c("pleasant", "quite pleasant", "very pleasant") ~ "pleasant",
    wind_speed %in% c("calm", "perfectly calm") ~ "calm",
    wind_speed %in% c("scant wind", "heavy winds") ~ "wind",
    wind_speed %in% c(
      "rough", "squall", "strong", "very light", "heavy", "baffling",
      "a light air", "variable", "fair"
    ) ~ "other intensities"
    # Terms not listed above get category NA.
  ))
# Frequency of each wind-speed term in 1873, one panel per period of the day.
wind_speed_1873 %>%
  # The literal string "NA" is treated as a missing value.
  mutate(wind_speed = na_if(wind_speed, "NA")) %>%
  drop_na(wind_speed, period, category) %>%
  # count() returns an ungrouped table, so no grouping leaks into later steps.
  count(wind_speed, period, category) %>%
  mutate(period = factor(period, levels = c("am", "pm", "night"))) %>%
  ggplot(aes(x = wind_speed, y = n, fill = category)) +
  geom_col(position = "dodge") +
  labs(
    title = "Wind Vocabulary Frequency by Period of the Day 1873",
    x = "Wind Speed Vocabulary",
    y = "Frequency",
    fill = "Category"
  ) +
  theme_minimal() +
  coord_flip() +
  facet_wrap(~ period)
## `summarise()` has grouped output by 'wind_speed', 'period'. You can override
## using the `.groups` argument.

  # Overall frequency of each wind-speed term in 1873 on a polar bar chart.
  wind_speed_1873 %>%
    mutate(wind_speed = na_if(wind_speed, "NA")) %>%
    drop_na(wind_speed, period, category) %>%
    # This chart is not faceted by period, so counts are aggregated across
    # periods. Counting per period would draw several bars on top of each other
    # for the same term, showing only the tallest instead of the total.
    count(wind_speed, category) %>%
    ggplot(aes(x = wind_speed, y = n, fill = category)) +
    geom_col(position = "dodge") +
    labs(
      title = "General Wind Vocabulary Frequency 1873",
      x = "Wind Speed Vocabulary",
      y = "Frequency",
      fill = "Category"
    ) +
    theme_minimal(base_size = 7) +
    coord_polar() +
    scale_y_log10() # to better see smaller frequencies
## `summarise()` has grouped output by 'wind_speed', 'period'. You can override
## using the `.groups` argument.

# Pie chart of the 1873 wind vocabulary by category.
wind_speed_1873 %>%
  mutate(wind_speed = na_if(wind_speed, "NA")) %>%
  drop_na(wind_speed, period, category) %>%
  # One slice per category: total number of term mentions in that category.
  count(category) %>%
  ggplot(aes(x = "", y = n, fill = category)) +
  geom_col(width = 1) + # 1 for pie chart effect
  coord_polar("y", start = 0) +
  # theme_void() is a complete theme; a theme_minimal() placed before it would
  # be replaced entirely, so only theme_void() is applied.
  theme_void() +
  labs(
    title = " 1873 Wind Vocabulary by Category",
    x = NULL,
    y = "Frequency",
    fill = "Category"
  )
## `summarise()` has grouped output by 'wind_speed', 'period'. You can override
## using the `.groups` argument.

Wind Speed Comparison 1873-1874

# One row per wind-speed term recorded in 1874, tagged with the period of the
# day (am / pm / night) and a coarser vocabulary category.
wind_speed_1874 <- journal_1874 %>%
  select(
    date_mdy,
    month,
    wind_speed_am,
    wind_speed_pm,
    wind_speed_night
  ) %>%
  # Stored as a string, like `year` in the other yearly tables of this file,
  # so binding and joining on `year` never mixes numeric and character values.
  mutate(year = "1874") %>%
  pivot_longer(
    cols = starts_with("wind_speed"),
    names_to = "period",
    values_to = "wind_speed"
  ) %>%
  mutate(period = case_when(
    period == "wind_speed_am" ~ "am",
    period == "wind_speed_pm" ~ "pm",
    period == "wind_speed_night" ~ "night"
  )) %>%
  separate_rows(wind_speed, sep = ",") %>%
  # Splitting on a bare comma leaves the space that follows it attached to the
  # next term ("a, b" -> "a", " b"), which would defeat the exact %in% matches
  # below, so surrounding whitespace is stripped first.
  mutate(wind_speed = str_trim(wind_speed)) %>%
  mutate(category = case_when(
    wind_speed %in% c("blustering", "very blustering") ~ "blustering",
    wind_speed %in% c(
      "breezy", "good breeze", "very fresh breeze", "breezed up",
      "fresh breeze", "heavy breeze", "smart breeze", "strong breeze"
    ) ~ "breeze",
    wind_speed %in% c(
      "blowing very heavy", "fresh blow", "heavy blow", "very heavy blow",
      "blowy", "blowing", "blowing hard", "blowing very hard"
    ) ~ "blow",
    wind_speed %in% c("fresh gale", "gale") ~ "gale",
    wind_speed %in% c("moderate", "quite moderate", "very moderate") ~ "moderate",
    wind_speed %in% c("pleasant", "quite pleasant", "very pleasant") ~ "pleasant",
    wind_speed %in% c("calm", "perfectly calm") ~ "calm",
    wind_speed %in% c("scant wind", "heavy winds") ~ "wind",
    wind_speed %in% c(
      "rough", "squall", "strong", "very light", "heavy", "baffling",
      "a light air", "variable", "fair"
    ) ~ "other intensities"
    # Terms not listed above get category NA.
  ))
# Stack both years into one table. rbind() is kept on purpose: it coerces
# `year` to a common type if the two yearly tables store it differently,
# whereas a stricter row-bind would refuse mixed column types.
combined_wind_speed <- rbind(wind_speed_1873, wind_speed_1874)

# Frequency of each wind-speed term, one panel per year.
combined_wind_speed %>%
  # The literal string "NA" is treated as a missing value.
  mutate(wind_speed = na_if(wind_speed, "NA")) %>%
  drop_na(wind_speed, period, category, year) %>%
  count(wind_speed, category, year) %>%
  ggplot(aes(x = wind_speed, y = n, fill = category)) +
  geom_col(position = "dodge") +
  labs(
    title = "Yearly Comparison of Wind Speed Vocabulary",
    subtitle = "Colored by Overarching Categories in 1873 and 1874",
    x = "Wind Speed Vocabulary",
    y = "Frequency",
    fill = "Category"
  ) +
  theme_minimal() +
  coord_flip() +
  facet_wrap(~ year)
## `summarise()` has grouped output by 'wind_speed', 'category'. You can override
## using the `.groups` argument.

# Wind-speed category counts per month, with both years stacked in each bar.
combined_wind_speed %>%
  mutate(wind_speed = na_if(wind_speed, "NA")) %>%
  drop_na(wind_speed, period, category, year) %>%
  count(category, month, year, name = "frequency") %>%
  # Order the panels January..December instead of alphabetically.
  # NOTE(review): assumes `month` holds full English month names, as the
  # month-level factors elsewhere in this file do.
  mutate(month = factor(month, levels = month.name)) %>%
  ggplot(aes(x = category, y = frequency, fill = factor(year))) +
  geom_col(position = "stack") +
  labs(
    title = "Monthly Comparison of Wind Speed Categories 1873-1874",
    x = "Category",
    y = "Frequency",
    fill = "Year"
  ) +
  facet_wrap(~ month) +
  coord_flip()
## `summarise()` has grouped output by 'category', 'month'. You can override using
## the `.groups` argument.

Wind Directions 1873

#' Convert 16-point compass abbreviations to bearings in degrees
#'
#' @param wind_direction Vector of compass abbreviations ("N", "NNE", ...,
#'   "NNW"). Matching ignores letter case and surrounding whitespace.
#' @return A numeric vector the same length as `wind_direction`: 0 for N,
#'   increasing clockwise in 22.5-degree steps up to 337.5 for NNW, and
#'   `NA_real_` for missing or unrecognised values.
convert_wind_direction_to_degrees <- function(wind_direction) {
  compass_points <- c(
    "N", "NNE", "NE", "ENE", "E", "ESE", "SE", "SSE",
    "S", "SSW", "SW", "WSW", "W", "WNW", "NW", "NNW"
  )
  # The points are evenly spaced: position k (0-based) sits at k * 22.5 degrees.
  bearings <- (seq_along(compass_points) - 1) * 22.5

  # as.character() first so factors are matched by label, not by integer code.
  normalised <- toupper(trimws(as.character(wind_direction)))

  # match() yields NA for anything not in the table, which indexes to NA_real_.
  bearings[match(normalised, compass_points)]
}
# Reshape the 1873 journal to one row per recorded wind-direction term, with
# the period of the day and the bearing in degrees alongside it.
wind_direction_1873_long <- journal_1873 %>%
  pivot_longer(
    cols = starts_with("wind_direction"),
    names_to = "period",
    values_to = "wind_direction"
  ) %>%
  separate_rows(wind_direction, sep = ", ") %>%
  mutate(
    # Source column name -> short period label; any other name becomes NA.
    period = case_match(
      period,
      "wind_direction_am" ~ "am",
      "wind_direction_pm" ~ "pm",
      "wind_direction_night" ~ "night"
    ),
    # Bearings are only filled in for rows with a recognised period.
    wind_degrees = if_else(
      period %in% c("am", "pm", "night"),
      convert_wind_direction_to_degrees(wind_direction),
      NA_real_
    )
  ) %>%
  select(date_mdy, month, wind_direction, period, wind_degrees) %>%
  mutate(year = "1873")
# Express a bearing in radians after shifting it back by a quarter turn
# (90 degrees); used for the x aesthetic of the polar wind-direction plots.
to_radians <- function(degrees) {
  shifted <- degrees - 90
  shifted * pi / 180
}

# Polar bar chart of how often each compass direction is mentioned in 1873.
wind_direction_1873_long %>%
  ggplot(aes(x = to_radians(wind_degrees))) +
  # after_stat(count) replaces the deprecated stat(count). geom_bar() has no
  # `bins` parameter, so none is passed.
  geom_bar(aes(fill = after_stat(count)), color = "black") +
  scale_fill_viridis_c(option = "plasma", name = "Frequency") +
  # Direction labels drawn at a fixed height; x is inherited from the plot.
  geom_text(
    aes(y = 150, label = wind_direction),
    size = 4,
    fontface = "bold",
    color = "black"
  ) +
  labs(
    title = "Wind Directions Mentioned, 1873",
    # Must be named: an unnamed string passed to labs() is not shown.
    subtitle = "Only Directions Explicitly Specified by Period",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(axis.text = element_blank()) +
  coord_polar() +
  scale_y_continuous(name = "Frequency", trans = "log10") # logarithmic scale
## Warning in geom_bar(aes(fill = stat(count)), bins = 16, color = "black"):
## Ignoring unknown parameters: `bins`
## Warning: `stat(count)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 761 rows containing non-finite values (`stat_count()`).
## Warning: Removed 761 rows containing missing values (`geom_text()`).

# Polar bar chart of wind-direction mentions in 1873, one panel per period.
wind_direction_1873_long %>%
  mutate(period = factor(period, levels = c("am", "pm", "night"))) %>%
  ggplot(aes(x = to_radians(wind_degrees))) +
  # after_stat(count) replaces the deprecated stat(count). geom_bar() has no
  # `bins` parameter, so none is passed.
  geom_bar(aes(fill = after_stat(count)), color = "black") +
  scale_fill_viridis_c(option = "plasma", name = "Frequency") +
  # Direction labels drawn at a fixed height; x is inherited from the plot.
  geom_text(
    aes(y = 150, label = wind_direction),
    size = 4,
    fontface = "bold",
    color = "black"
  ) +
  labs(
    title = "Wind Directions Mentioned by Period of the Day, 1873",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(axis.text = element_blank()) +
  coord_polar() +
  scale_y_continuous(name = "Frequency", trans = "log10") + # logarithmic scale
  facet_wrap(~ period)
## Warning in geom_bar(aes(fill = stat(count)), bins = 16, color = "black"):
## Ignoring unknown parameters: `bins`
## Warning: Removed 761 rows containing non-finite values (`stat_count()`).
## Warning: Removed 761 rows containing missing values (`geom_text()`).

# Polar bar chart of wind-direction mentions in 1873, one panel per month.
wind_direction_1873_long %>%
  # Order the panels January..December instead of alphabetically.
  # NOTE(review): assumes `month` holds full English month names, as the
  # month-level factors elsewhere in this file do.
  mutate(month = factor(month, levels = month.name)) %>%
  ggplot(aes(x = to_radians(wind_degrees))) +
  # after_stat(count) replaces the deprecated stat(count). geom_bar() has no
  # `bins` parameter, so none is passed.
  geom_bar(aes(fill = after_stat(count)), color = "black") +
  scale_fill_viridis_c(option = "plasma", name = "Frequency") +
  # Direction labels drawn at a fixed height; x is inherited from the plot.
  geom_text(
    aes(y = 150, label = wind_direction),
    size = 4,
    fontface = "bold",
    color = "black"
  ) +
  labs(
    title = "Wind Directions Mentioned by Month, 1873",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(axis.text = element_blank()) +
  coord_polar() +
  scale_y_continuous(name = "Frequency", trans = "log10") + # logarithmic scale
  facet_wrap(~ month)
## Warning in geom_bar(aes(fill = stat(count)), bins = 16, color = "black"):
## Ignoring unknown parameters: `bins`
## Warning: Removed 761 rows containing non-finite values (`stat_count()`).
## Warning: Removed 761 rows containing missing values (`geom_text()`).

# Stacked bars of wind-direction mentions per month in 1873, split by period.
wind_direction_1873_long %>%
  # The literal string "NA" is treated as a missing value.
  mutate(wind_direction = na_if(wind_direction, "NA")) %>%
  mutate(period = factor(period, levels = c("am", "pm", "night"))) %>%
  drop_na(wind_direction, period, month) %>%
  # Order the panels January..December instead of alphabetically.
  # NOTE(review): assumes `month` holds full English month names, as the
  # month-level factors elsewhere in this file do.
  mutate(month = factor(month, levels = month.name)) %>%
  ggplot(aes(x = wind_direction, fill = period)) +
  geom_bar(position = "stack", color = "black") +
  labs(
    title = "Bar Chart of Monthly Wind Directions Frequencies in 1873",
    x = "Wind Direction",
    y = "Frequency"
  ) +
  coord_flip() +
  facet_wrap(~ month, nrow = 2)

Wind Direction Comparison 1873-1874

# Reshape the 1874 journal to one row per recorded wind-direction term, with
# the period of the day and the bearing in degrees alongside it.
wind_direction_1874_long <- journal_1874 %>%
  pivot_longer(
    cols = starts_with("wind_direction"),
    names_to = "period",
    values_to = "wind_direction"
  ) %>%
  separate_rows(wind_direction, sep = ", ") %>%
  mutate(
    # Source column name -> short period label; any other name becomes NA.
    period = case_match(
      period,
      "wind_direction_am" ~ "am",
      "wind_direction_pm" ~ "pm",
      "wind_direction_night" ~ "night"
    ),
    # Bearings are only filled in for rows with a recognised period.
    wind_degrees = if_else(
      period %in% c("am", "pm", "night"),
      convert_wind_direction_to_degrees(wind_direction),
      NA_real_
    )
  ) %>%
  select(date_mdy, month, wind_direction, period, wind_degrees) %>%
  mutate(year = "1874")
# Both years of wind-direction terms in a single table.
combined_wind_direction <- rbind(
  wind_direction_1873_long,
  wind_direction_1874_long
)

# Stacked bars: mentions of each direction, coloured by year.
combined_wind_direction %>%
  # Drop rows whose direction is missing or the literal string "NA", and rows
  # with no period or year.
  filter(
    !wind_direction %in% c(NA, "NA"),
    !is.na(period),
    !is.na(year)
  ) %>%
  ggplot(aes(x = wind_direction, fill = year)) +
  geom_bar(position = "stack", color = "black") +
  labs(
    title = "Bar Chart of Yearly Wind Directions Frequencies",
    x = "Wind Direction",
    y = "Frequency"
  ) +
  coord_flip()

Wind Direction and Wind Speed

# Pair every wind-direction term with every wind-speed term recorded for the
# same day and period. Both tables hold one row per term (after
# separate_rows()), so repeated keys on both sides are expected; declaring the
# relationship makes that intent explicit instead of triggering a warning.
combined_wind <- full_join(
  combined_wind_direction,
  combined_wind_speed,
  by = c("date_mdy", "month", "year", "period"),
  relationship = "many-to-many"
)
## Warning in full_join(combined_wind_direction, combined_wind_speed, by = c("date_mdy", : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 94 of `x` matches multiple rows in `y`.
## ℹ Row 409 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Keep only rows where both a direction and a speed term are present; the
# literal string "NA" counts as missing.
combined_wind <- combined_wind %>%
  mutate(
    wind_direction = na_if(wind_direction, "NA"),
    wind_speed = na_if(wind_speed, "NA")
  ) %>%
  drop_na(wind_direction, wind_speed)

# Number of co-occurrences of each (speed term, direction) pair. count()
# returns an ungrouped table, so no grouping leaks into later steps.
freq_combined_wind <- combined_wind %>%
  count(wind_speed, wind_direction, name = "frequency")
## `summarise()` has grouped output by 'wind_speed'. You can override using the
## `.groups` argument.
# Heat map of how often each wind-speed term co-occurs with each direction.
freq_combined_wind %>%
  ggplot(aes(x = wind_direction, y = wind_speed, fill = frequency)) +
  geom_tile() +
  # A single fill scale; direction = -1 reverses the palette so that darker
  # means higher frequency. A second fill scale would just replace the first.
  scale_fill_viridis_c(direction = -1) +
  labs(
    title = "Wind Speed and Wind Direction Heat Map",
    x = "Wind Direction",
    y = "Wind Speed",
    fill = "Frequency"
  )
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

# Heat map of wind-speed term vs direction co-occurrences, one panel per year.
combined_wind %>%
  count(wind_speed, wind_direction, year, name = "frequency") %>%
  ggplot(aes(x = wind_direction, y = wind_speed, fill = frequency)) +
  geom_tile() +
  # A single fill scale; direction = -1 reverses the palette so that darker
  # means higher frequency. A second fill scale would just replace the first.
  scale_fill_viridis_c(direction = -1) +
  labs(
    title = "Wind Speed and Wind Direction Heat Map by Year",
    x = "Wind Direction",
    y = "Wind Speed",
    fill = "Frequency"
  ) +
  facet_wrap(~ year)
## `summarise()` has grouped output by 'wind_speed', 'wind_direction'. You can
## override using the `.groups` argument.
## Scale for fill is already present. Adding another scale for fill, which will
## replace the existing scale.

Weather Conditions

# One row per weather-condition term recorded in 1873, tagged with the period
# of the day (am / pm / night) and a coarser category.
weather_con_1873_long <- journal_1873 %>%
  select(
    date_mdy,
    month,
    weather_condition_am,
    weather_condition_pm,
    weather_condition_night
  ) %>%
  mutate(year = "1873") %>%
  pivot_longer(
    cols = starts_with("weather_condition"),
    names_to = "period",
    values_to = "weather_condition"
  ) %>%
  mutate(period = case_when(
    period == "weather_condition_am" ~ "am",
    period == "weather_condition_pm" ~ "pm",
    period == "weather_condition_night" ~ "night"
  )) %>%
  separate_rows(weather_condition, sep = ",") %>%
  # Splitting on a bare comma leaves the space that follows it attached to the
  # next term ("a, b" -> "a", " b"), which would defeat the exact %in% matches
  # below, so surrounding whitespace is stripped first.
  mutate(weather_condition = str_trim(weather_condition)) %>%
  mutate(category = case_when(
    weather_condition %in% c(
      "chilly", "cold", "cool", "extremely cold", "very cold", "quite cold"
    ) ~ "cold",
    # Single "pleasant" rule (the original had two overlapping ones).
    weather_condition %in% c(
      "pleasant", "quite pleasant", "very pleasant"
    ) ~ "pleasant",
    weather_condition %in% c("very warm", "warm", "hot") ~ "warm",
    weather_condition %in% c("clear", "cleared up", "fine") ~ "clear",
    weather_condition %in% c("overcast", "cloudy") ~ "cloud",
    weather_condition %in% c("calm", "perfectly calm") ~ "calm",
    weather_condition %in% c(
      "foggy", "dense fog", "thick fog", "very foggy", "thick with fog",
      "very thick with fog"
    ) ~ "fog",
    weather_condition %in% c(
      "heavy showers", "very heavy shower", "shower", "showery",
      "a little showery"
    ) ~ "showers",
    weather_condition %in% c("drizzle", "drizzling rain") ~ "drizzle",
    weather_condition %in% c(
      "cold rain", "heavy rain storm", "heavy rain", "fine rain", "raining",
      "rainy", "rain", "rain spells", "rain squall", "rain storm",
      "moderate rain", "big rain storm", "very heavy rain"
    ) ~ "rain",
    weather_condition %in% c(
      "little snow", "snow sleet", "light snow", "thick snow", "pleasant snow",
      "snowy", "snow", "snow spells", "snow squall", "big snow storm",
      "snow storm", "snowing", "snowing fast", "moderate snow"
    ) ~ "snow",
    weather_condition %in% c(
      "stormy", "tough storm", "very heavy storm", "moderate rainstorm",
      "heavy storm"
    ) ~ "storm",
    weather_condition %in% c("thunder", "heavy thunder") ~ "thunder",
    weather_condition %in% c("sharp lightning", "lightning") ~ "lightning",
    weather_condition %in% c(
      "misty", "good weather", "moderate weather", "sun out", "hail"
    ) ~ "other"
    # Terms not listed above get category NA.
  ))
# Colour assigned to each weather category.
category_colors <- c(
  "cold" = "#16324a",
  "pleasant" = "#ccf146",
  "warm" = "#b5a642",
  "clear" = "#9467bd",
  "cloud" = "#8c564b",
  "calm" = "#2ca02c",
  "fog" = "#e377c2",
  "showers" = "#1f77b4",
  "drizzle" = "#004f95",
  "rain" = "#17becf",
  "snow" = "#bbbbbb",
  "storm" = "#ff7f0e",
  "thunder" = "#d62728",
  "lightning" = "#ff9896",
  "other" = "#c5b0d5"
)

# Frequency of each 1873 weather term, together with its category and the
# colour of that category. The literal string "NA" counts as missing.
weather_con_1873_freq <- weather_con_1873_long %>%
  mutate(weather_condition = na_if(weather_condition, "NA")) %>%
  drop_na(weather_condition) %>%
  count(weather_condition, category) %>%
  # Terms without a category fall back to a neutral grey.
  mutate(color = coalesce(unname(category_colors[category]), "#7f7f7f"))

# Word cloud of the 1873 weather vocabulary, each word in its category colour.
# ordered.colors = TRUE makes wordcloud() use colors[i] for words[i]; without
# it the colour vector is applied by frequency rank and the category names on
# `category_colors` are ignored.
wordcloud(
  weather_con_1873_freq$weather_condition,
  weather_con_1873_freq$n,
  colors = weather_con_1873_freq$color,
  ordered.colors = TRUE,
  random.order = FALSE,
  scale = c(5, 1),
  min.freq = 1,
  max.words = Inf
)

# Frequency of each weather term in 1873, one panel per period of the day.
weather_con_1873_long %>%
  # The literal string "NA" is treated as a missing value.
  mutate(weather_condition = na_if(weather_condition, "NA")) %>%
  drop_na(weather_condition, period, category) %>%
  # count() returns an ungrouped table, so no grouping leaks into later steps.
  count(weather_condition, period, category) %>%
  mutate(period = factor(period, levels = c("am", "pm", "night"))) %>%
  ggplot(aes(x = weather_condition, y = n, fill = category)) +
  geom_col(position = "dodge") +
  labs(
    title = "Weather Condition Frequency by Period of the Day 1873",
    x = "Weather Condition Vocabulary",
    y = "Frequency",
    fill = "Category"
  ) +
  theme_minimal() +
  coord_flip() +
  facet_wrap(~ period)
## `summarise()` has grouped output by 'weather_condition', 'period'. You can
## override using the `.groups` argument.

# Polar stacked bars: number of weather-category mentions per month in 1873.
weather_con_1873_long %>%
  # The literal string "NA" is treated as a missing value.
  mutate(weather_condition = na_if(weather_condition, "NA")) %>%
  drop_na(weather_condition, category, month) %>%
  count(month, category) %>%
  # Calendar order on the angular axis.
  mutate(month = factor(month, levels = month.name)) %>%
  ggplot(aes(x = month, y = n, fill = category)) +
  geom_col(position = "stack") +
  # The "Spectral" brewer palette provides at most 11 colours, fewer than the
  # 15 weather categories defined in this file, so the file's own
  # category_colors palette (one colour per category) is used instead.
  scale_fill_manual(values = category_colors) +
  labs(
    title = "Weather Frequency By Category per month",
    fill = "Category",
    x = "Month",
    y = "Number of occurrences"
  ) +
  coord_polar()

# One row per weather-condition term recorded in 1874, tagged with the period
# of the day (am / pm / night) and a coarser category.
weather_con_1874_long <- journal_1874 %>%
  select(
    date_mdy,
    month,
    weather_condition_am,
    weather_condition_pm,
    weather_condition_night
  ) %>%
  mutate(year = "1874") %>%
  pivot_longer(
    cols = starts_with("weather_condition"),
    names_to = "period",
    values_to = "weather_condition"
  ) %>%
  mutate(period = case_when(
    period == "weather_condition_am" ~ "am",
    period == "weather_condition_pm" ~ "pm",
    period == "weather_condition_night" ~ "night"
  )) %>%
  separate_rows(weather_condition, sep = ",") %>%
  # Splitting on a bare comma leaves the space that follows it attached to the
  # next term ("a, b" -> "a", " b"), which would defeat the exact %in% matches
  # below, so surrounding whitespace is stripped first.
  mutate(weather_condition = str_trim(weather_condition)) %>%
  mutate(category = case_when(
    weather_condition %in% c(
      "chilly", "cold", "cool", "extremely cold", "very cold", "quite cold"
    ) ~ "cold",
    # Single "pleasant" rule (the original had two overlapping ones).
    weather_condition %in% c(
      "pleasant", "quite pleasant", "very pleasant"
    ) ~ "pleasant",
    weather_condition %in% c("very warm", "warm", "hot") ~ "warm",
    weather_condition %in% c("clear", "cleared up", "fine") ~ "clear",
    weather_condition %in% c("overcast", "cloudy") ~ "cloud",
    weather_condition %in% c("calm", "perfectly calm") ~ "calm",
    weather_condition %in% c(
      "foggy", "dense fog", "thick fog", "very foggy", "thick with fog",
      "very thick with fog"
    ) ~ "fog",
    weather_condition %in% c(
      "heavy showers", "very heavy shower", "shower", "showery",
      "a little showery"
    ) ~ "showers",
    weather_condition %in% c("drizzle", "drizzling rain") ~ "drizzle",
    weather_condition %in% c(
      "cold rain", "heavy rain storm", "heavy rain", "fine rain", "raining",
      "rainy", "rain", "rain spells", "rain squall", "rain storm",
      "moderate rain", "big rain storm", "very heavy rain"
    ) ~ "rain",
    weather_condition %in% c(
      "little snow", "snow sleet", "light snow", "thick snow", "pleasant snow",
      "snowy", "snow", "snow spells", "snow squall", "big snow storm",
      "snow storm", "snowing", "snowing fast", "moderate snow"
    ) ~ "snow",
    weather_condition %in% c(
      "stormy", "tough storm", "very heavy storm", "moderate rainstorm",
      "heavy storm"
    ) ~ "storm",
    weather_condition %in% c("thunder", "heavy thunder") ~ "thunder",
    weather_condition %in% c("sharp lightning", "lightning") ~ "lightning",
    weather_condition %in% c(
      "misty", "good weather", "moderate weather", "sun out", "hail"
    ) ~ "other"
    # Terms not listed above get category NA.
  ))

# Both years of weather-condition terms in a single table.
weather_con <- rbind(weather_con_1873_long, weather_con_1874_long)

Wind Speed and Weather Conditions

# Pair every wind observation with every weather term recorded for the same
# day and period. `weather_con` stacks the 1873 and 1874 tables, so both years
# take part in the join. Each side holds one row per term, so repeated keys on
# both sides are expected; declaring the relationship makes that explicit
# instead of triggering a warning.
wind_weather <- full_join(
  combined_wind,
  weather_con,
  by = c("date_mdy", "month", "year", "period"),
  relationship = "many-to-many"
)
## Warning in full_join(combined_wind, weather_con, by = c("date_mdy", "month", : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 104 of `x` matches multiple rows in `y`.
## ℹ Row 525 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Keep only rows where direction, speed and weather term are all present; the
# literal string "NA" counts as missing.
wind_weather <- wind_weather %>%
  mutate(
    wind_direction = na_if(wind_direction, "NA"),
    wind_speed = na_if(wind_speed, "NA"),
    weather_condition = na_if(weather_condition, "NA")
  ) %>%
  drop_na(wind_direction, wind_speed, weather_condition)

# Number of co-occurrences of each (speed term, weather term) pair. count()
# returns an ungrouped table, so no grouping leaks into later steps.
freq_wind_weather <- wind_weather %>%
  count(wind_speed, weather_condition, name = "frequency")
## `summarise()` has grouped output by 'wind_speed'. You can override using the
## `.groups` argument.
# Heat map of how often each wind-speed term co-occurs with each weather term.
freq_wind_weather %>%
  ggplot(aes(x = weather_condition, y = wind_speed, fill = frequency)) +
  geom_tile() +
  # A single fill scale; direction = -1 reverses the palette so that darker
  # means higher frequency. A second fill scale would just replace the first.
  scale_fill_viridis_c(direction = -1) +
  labs(
    title = "Wind Speed and Weather Conditions Heat Map",
    x = "Weather Condition",
    y = "Wind Speed",
    fill = "Frequency"
  ) +
  coord_flip()
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

# Heat map of wind-speed term vs weather term co-occurrences, one row of
# panels per year.
wind_weather %>%
  count(wind_speed, weather_condition, year, name = "frequency") %>%
  ggplot(aes(x = weather_condition, y = wind_speed, fill = frequency)) +
  geom_tile() +
  # A single fill scale; direction = -1 reverses the palette so that darker
  # means higher frequency. A second fill scale would just replace the first.
  scale_fill_viridis_c(direction = -1) +
  labs(
    title = "Wind Speed and Weather Conditions by Year Heat Map",
    x = "Weather Condition",
    y = "Wind Speed",
    fill = "Frequency"
  ) +
  facet_wrap(~ year, nrow = 2) +
  coord_flip()
## `summarise()` has grouped output by 'wind_speed', 'weather_condition'. You can
## override using the `.groups` argument.
## Scale for fill is already present. Adding another scale for fill, which will
## replace the existing scale.

Letters

# Harmonise the letter-status wording in both journals: "read" is recorded as
# "received"; every other value is kept as it is.
journal_1873 <- journal_1873 %>%
  mutate(letter = case_match(
    letter,
    "read" ~ "received",
    "read and write" ~ "received and write",
    .default = letter
  ))

journal_1874 <- journal_1874 %>%
  mutate(letter = case_match(
    letter,
    "read" ~ "received",
    "read and write" ~ "received and write",
    .default = letter
  ))

# Letter-related columns per year, with the year appended as the last column.
letters_1873 <- journal_1873 %>%
  mutate(year = "1873") %>%
  select(
    date_mdy, month, journal_entry, letter, letter_from, letter_to, notes,
    year
  )

letters_1874 <- journal_1874 %>%
  mutate(year = "1874") %>%
  select(
    date_mdy, month, journal_entry, letter, letter_from, letter_to, notes,
    year
  )

# Both years in a single table.
combined_letters <- rbind(letters_1873, letters_1874)
# Number of journal entries per letter status, one panel per year.
combined_letters %>%
  # Skip entries whose status is missing or the literal string "NA".
  filter(!letter %in% c(NA, "NA")) %>%
  count(letter, year) %>%
  ggplot(aes(x = letter, y = n, fill = year)) +
  geom_col(position = "dodge") +
  facet_wrap(~ year) +
  labs(
    title = "Letter Communication",
    subtitle = "For 1873 and 1874",
    x = "Letter Status",
    y = "Frequency",
    fill = "Year"
  )

# Same chart restricted to entries that actually mention a letter.
combined_letters %>%
  # Skip missing statuses, the literal string "NA", and "no letter" entries.
  filter(!letter %in% c(NA, "NA", "no letter")) %>%
  count(letter, year) %>%
  ggplot(aes(x = letter, y = n, fill = year)) +
  geom_col(position = "dodge") +
  facet_wrap(~ year) +
  labs(
    title = "Frequency of Letter Communication Mentioned",
    subtitle = "For 1873 and 1874",
    x = "Letter Status",
    y = "Frequency",
    fill = "Year"
  ) +
  scale_y_continuous(breaks = seq(5, 30, by = 5))

# Calendar order for month-level factors.
month_order <- c(
  "January", "February", "March", "April", "May", "June",
  "July", "August", "September", "October", "November", "December"
)

# Letter mentions by status, one panel per month, years side by side.
combined_letters %>%
  # The literal string "NA" is treated as a missing value.
  mutate(letter = na_if(letter, "NA")) %>%
  drop_na(letter) %>%
  filter(letter != "no letter") %>%
  mutate(month = factor(month, levels = month_order)) %>%
  count(letter, year, month) %>%
  ggplot(aes(x = letter, y = n, fill = year)) +
  geom_col(position = "dodge") +
  labs(
    title = "Frequency of Letter Communication Mentioned per Month",
    subtitle = "For 1873 and 1874",
    x = "Letter Status",
    y = "Frequency",
    fill = "Year"
  ) +
  # Only one facet specification can be active: a facet_wrap(~ year) placed
  # before this one would simply be replaced. Years are told apart by fill.
  facet_wrap(~ month)

Communication (Names & Letters)

#only rows with letters written or received
communication <- combined_letters %>%
  mutate(letter = ifelse(letter == "NA", NA, letter)) %>%
  drop_na(letter) %>%
  filter(letter != "no letter") %>% 
  separate_rows(letter_from, sep = ", ") %>% 
  separate_rows(letter_to, sep = ", ") %>% 
  mutate(letter_from = case_when(
    letter_from %in% c("Perlley and Russel", "Perley and Russell") ~ "Perley and Russell",
    letter_from %in% c("G.L. Hodgkins", "G. L. Hodgkins", "G. L Hodgins") ~ "G. L. Hodgkins",
    letter_from %in% c("Charles C. Burrill", "C.C. Burrill", "C. C. Burrill", "C.C Burrill", "C.C Burill") ~ "C. C. Burrill",
    letter_from %in% c("Benj Kittridge", "Benj. Kittridge") ~ "Benj. Kittridge",
    letter_from %in% c("Reg. of Deeds", "Reg of Deed") ~ "Reg of Deeds",
    TRUE ~ letter_from)) %>% 
  mutate(letter_to = case_when(
    letter_to %in% c("C. C. Burrill", "C.C Burrill") ~ "C. C. Burrill",
    letter_to %in% c("Benj. Kittrige", "Benj. Kittridge") ~ "Benj. Kittridge",
    TRUE ~ letter_to))
# Letters received, per sender and year, with bars split by month.
communication %>%
  drop_na(letter_from) %>%
  count(letter_from, month, year) %>%
  mutate(
    # Make month a calendar-ordered factor so the legend entries are the
    # months themselves. Overriding the labels of an alphabetically sorted
    # character column would attach the wrong month name to each colour.
    month = factor(month, levels = month.name),
    # Last whitespace-separated word of the name, used as the sort key.
    last_word = map_chr(str_extract_all(letter_from, "\\S+"), tail, 1)
  ) %>%
  arrange(last_word) %>% # alphabetical order based on last word
  # Freeze that order on the axis.
  mutate(letter_from = fct_inorder(letter_from)) %>%
  ggplot(aes(x = letter_from, y = n, fill = month)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Frequency of Letters Received by Freeland Bunker",
    subtitle = "In 1873 and 1874",
    x = "People Sending the Letters",
    y = "Frequency",
    fill = "Month"
  ) +
  # drop = FALSE keeps all twelve months in the legend.
  scale_fill_discrete(drop = FALSE) +
  coord_flip() +
  facet_wrap(~ year)

# Letters written, per recipient and year, with bars split by month.
communication %>%
  drop_na(letter_to) %>%
  count(letter_to, month, year) %>%
  mutate(
    # Make month a calendar-ordered factor so the legend entries are the
    # months themselves. Overriding the labels of an alphabetically sorted
    # character column would attach the wrong month name to each colour.
    month = factor(month, levels = month.name),
    # Last whitespace-separated word of the name, used as the sort key.
    last_word = map_chr(str_extract_all(letter_to, "\\S+"), tail, 1)
  ) %>%
  arrange(last_word) %>% # alphabetical order based on last word
  # Freeze that order on the axis.
  mutate(letter_to = fct_inorder(letter_to)) %>%
  ggplot(aes(x = letter_to, y = n, fill = month)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Frequency of Letters Written by Freeland Bunker",
    subtitle = "In 1873 and 1874",
    x = "Letter Recipients",
    y = "Frequency",
    fill = "Month"
  ) +
  # drop = FALSE keeps all twelve months in the legend.
  scale_fill_discrete(drop = FALSE) +
  coord_flip() +
  facet_wrap(~ year)